A demo for SUMO Submission evaluation package

source("../GAA-EVAL.R")
## Warning: package 'ggplot2' was built under R version 3.4.4
## Warning: package 'dplyr' was built under R version 3.4.4
## Warning: package 'pROC' was built under R version 3.4.4
## Warning: package 'reshape2' was built under R version 3.4.3
## Warning: package 'ggrepel' was built under R version 3.4.4
## Warning: package 'gmodels' was built under R version 3.4.4
## Warning: package 'stringr' was built under R version 3.4.4

Read in the Experimental data provided by CAGI

exp.data <- read.RealData(file = "exp_data.csv", sep = "\t",
                             col.id = 1, col.value = 2, col.sd = 3,na.character = 'NA')
length(exp.data$value)
exp.data.s <- read.RealData(file = "exp_data_s.csv", sep = ",",
                             col.id = 1, col.value = 2, col.sd = 3,na.character = 'NA')
head(exp.data.s$value)

Read in the submission folders

sub.data <- read.Submission.Folder(folder.name = "prediction/",col.id = 1,
                                      col.value = 2, col.sd = 3, real.data = exp.data)
sub.data.s = read.Submission.Folder(folder.name = "prediction/",col.id = 1,
                                      col.value = 2, col.sd = 3, real.data = exp.data.s)
sub.data = addGroup(sub.data,c(1,1,2,3,3,4,4,5,6,6,6,6,7,8,9,9))
sub.data.s = addGroup(sub.data.s,c(1,1,2,3,3,4,4,5,6,6,6,6,7,8,9,9))

ScatterPlot inspection

plot_all_scatter(real.data = exp.data, pred.data = sub.data, z.transform = TRUE)
## Warning: Removed 604 rows containing missing values (geom_point).
## Warning: Removed 5109 rows containing missing values (geom_segment).
## Warning: Removed 604 rows containing missing values (geom_point).
## Warning: Removed 5109 rows containing missing values (geom_segment).
## Warning: Removed 4894 rows containing missing values (geom_point).
## Warning: Removed 5109 rows containing missing values (geom_segment).
## Warning: Removed 604 rows containing missing values (geom_point).
## Warning: Removed 5109 rows containing missing values (geom_segment).
## Warning: Removed 604 rows containing missing values (geom_point).
## Warning: Removed 5109 rows containing missing values (geom_segment).
## Warning: Removed 4894 rows containing missing values (geom_point).
## Warning: Removed 5109 rows containing missing values (geom_segment).
## Warning: Removed 4894 rows containing missing values (geom_point).
## Warning: Removed 5109 rows containing missing values (geom_segment).
## Warning: Removed 604 rows containing missing values (geom_point).
## Warning: Removed 5109 rows containing missing values (geom_segment).
## Warning: Removed 604 rows containing missing values (geom_point).
## Warning: Removed 5109 rows containing missing values (geom_segment).
## Warning: Removed 604 rows containing missing values (geom_point).
## Warning: Removed 5109 rows containing missing values (geom_segment).
## Warning: Removed 604 rows containing missing values (geom_point).
## Warning: Removed 5109 rows containing missing values (geom_segment).
## Warning: Removed 604 rows containing missing values (geom_point).
## Warning: Removed 5109 rows containing missing values (geom_segment).
## Warning: Removed 4484 rows containing missing values (geom_point).
## Warning: Removed 5109 rows containing missing values (geom_segment).
## Warning: Removed 604 rows containing missing values (geom_point).
## Warning: Removed 5109 rows containing missing values (geom_segment).
## Warning: Removed 604 rows containing missing values (geom_point).
## Warning: Removed 5109 rows containing missing values (geom_segment).
## Warning: Removed 604 rows containing missing values (geom_point).
## Warning: Removed 5109 rows containing missing values (geom_segment).

Correlation-based Evaluation without/with bootstrap

without bootstrap method = “pearson”: In this example, we use pearson correlation sd.use = 0.3: experimental value with sd larger than 0.3 is filtered out z.transformation does not have effect to pearson correlation

# 1. Render coefficient value
result.cor.pearson <- eval.Correlation(real.data = exp.data.s, pred.data = sub.data.s,
                                       method = "pearson", sd.use = 0.3,z.transform = TRUE)
head(result.cor.pearson)
##                            pearson.coefficient.n=205.sd<0.3      p.value
## Group_39-prediction_file-1                        0.3572199 1.457490e-07
## Group_39-prediction_file-2                        0.3540692 1.906731e-07
## Group_40-prediction_file-1                        0.2995061 1.285132e-05
## Group_41-prediction_file-1                        0.3794999 2.003074e-08
## Group_41-prediction_file-2                        0.3794999 2.003074e-08
## Group_42-prediction_file-1                        0.1345214 5.447680e-02
# 2. Plot Correlation
plot.Correlation(result.cor.pearson, "Pearson")

Bootstrap using row replacement (rep.time = 500 as a fixed input for now) boot = T, boot.var = F

# 1. Render coefficient value
boot.result.cor.pearson <- eval.Correlation(real.data = exp.data.s, pred.data = sub.data.s,
                                       method = "pearson", sd.use = 0.3,z.transform = TRUE,boot = T)
# For Correlation-based Evaluation, provide mean, CI, and median pval
head(boot.result.cor.pearson)
##                                  avg      low_ci   high_ci         sd
## Group_39-prediction_file-1 0.3584616 0.248757113 0.4650019 0.06693278
## Group_39-prediction_file-2 0.3551402 0.247152353 0.4670083 0.06645212
## Group_40-prediction_file-1 0.3035865 0.203474870 0.4011387 0.06101202
## Group_41-prediction_file-1 0.3814340 0.299085806 0.4588648 0.04934251
## Group_41-prediction_file-2 0.3814340 0.299085806 0.4588648 0.04934251
## Group_42-prediction_file-1 0.1300704 0.008814297 0.2653382 0.07935949
##                                 p.value
## Group_39-prediction_file-1 1.308500e-07
## Group_39-prediction_file-2 1.719168e-07
## Group_40-prediction_file-1 1.016501e-05
## Group_41-prediction_file-1 1.373425e-08
## Group_41-prediction_file-2 1.373425e-08
## Group_42-prediction_file-1 7.654748e-02
# 2. Plot Correlation
plot.Correlation(boot.result.cor.pearson, "Pearson",boot = TRUE)

Bootstrap using distribution modelling (rep.time = 500 as a fixed input for now) boot = T, boot.var = T

bootvar.result.cor.pearson <- eval.Correlation(real.data = exp.data.s, pred.data = sub.data.s,
                                       method = "pearson", sd.use = 0.3,z.transform = TRUE,boot = T,boot.var = T )
head(bootvar.result.cor.pearson)
##                                  avg    low_ci   high_ci          sd
## Group_39-prediction_file-1 0.3547601 0.3412656 0.3692032 0.008070335
## Group_39-prediction_file-2 0.3515426 0.3375549 0.3655065 0.008193730
## Group_40-prediction_file-1 0.2973147 0.2833942 0.3098228 0.007979621
## Group_41-prediction_file-1 0.3772570 0.3641273 0.3895295 0.007732230
## Group_41-prediction_file-2 0.3772570 0.3641273 0.3895295 0.007732230
## Group_42-prediction_file-1 0.1335184 0.1191248 0.1464837 0.008738655
##                                 p.value
## Group_39-prediction_file-1 1.774906e-07
## Group_39-prediction_file-2 2.367728e-07
## Group_40-prediction_file-1 1.471955e-05
## Group_41-prediction_file-1 2.475738e-08
## Group_41-prediction_file-2 2.475738e-08
## Group_42-prediction_file-1 5.595699e-02
plot.Correlation(bootvar.result.cor.pearson, "Pearson",boot = TRUE)

RMSD-based Evaluation

  • (for demonstration, only present the bootstrapped result)
  • variance.normalization: defined on PPT
  • density.distance: RMSD based on distribution model of experimental value
# without variance.normalization + without density.distance
boot.result.rmsd4 <- eval.RMSD(real.data = exp.data.s, pred.data = sub.data.s,sd.use = NA, 
                      density.distance = FALSE,variance.normalization = FALSE,boot = TRUE)
head(boot.result.rmsd4)
##                                 RMSD    low_ci   high_ci         sd
## Group_39-prediction_file-1 0.7582194 0.6948829 0.8238805 0.03795513
## Group_39-prediction_file-2 0.7236246 0.6616226 0.7896134 0.03916140
## Group_40-prediction_file-1 0.6193394 0.5589046 0.6751160 0.03625268
## Group_41-prediction_file-1 0.5763456 0.5352569 0.6192372 0.02490414
## Group_41-prediction_file-2 0.5763456 0.5352569 0.6192372 0.02490414
## Group_42-prediction_file-1 0.6400337 0.5800956 0.7060908 0.03955404
plot.RMSD(boot.result.rmsd4, method="",boot = TRUE)

# with variance.normalization + without density.distance
boot.result.rmsd2 <- eval.RMSD(real.data = exp.data.s, pred.data = sub.data.s,sd.use = NA, 
                      density.distance = FALSE,variance.normalization = TRUE,boot = TRUE)
head(boot.result.rmsd2)
##                                RMSD    low_ci  high_ci         sd
## Group_39-prediction_file-1 1.096020 0.9974707 1.201784 0.06331646
## Group_39-prediction_file-2 1.018759 0.9262273 1.111976 0.05627738
## Group_40-prediction_file-1 1.141237 1.0454992 1.243660 0.05919249
## Group_41-prediction_file-1 1.581274 1.4056522 1.775854 0.11120052
## Group_41-prediction_file-2 1.581274 1.4056522 1.775854 0.11120052
## Group_42-prediction_file-1 1.620394 1.4547180 1.774669 0.10079146
plot.RMSD(boot.result.rmsd2, method="",boot = TRUE)

# with variance.normalization + with density.distance
boot.result.rmsd <- eval.RMSD(real.data = exp.data.s, pred.data = sub.data.s,sd.use = NA, 
                      density.distance = TRUE, variance.normalization = TRUE, boot = TRUE)
head(boot.result.rmsd)
##                                 RMSD    low_ci  high_ci         sd
## Group_39-prediction_file-1 1.0730166 0.9615569 1.181177 0.06564373
## Group_39-prediction_file-2 0.9985357 0.9006786 1.089820 0.05708015
## Group_40-prediction_file-1 1.1270789 1.0319753 1.223254 0.05692394
## Group_41-prediction_file-1 1.5582421 1.3888112 1.744497 0.11175008
## Group_41-prediction_file-2 1.5582421 1.3888112 1.744497 0.11175008
## Group_42-prediction_file-1 1.6294310 1.4715430 1.777417 0.09680995
plot.RMSD(boot.result.rmsd, method="",boot = TRUE)

Cut-off-based Evaluation

threshold = 0.6: The cutoff for positive/negative value

result.auc.0.6 <- eval.AUC(real.data =exp.data.s, pred.data = sub.data.s, 
                           threshold = 0.6)
head(result.auc.0.6$results)
##                            AUC.0.6 sensitivity specificity accuracy
## Group_39-prediction_file-1  0.6885      0.7931      0.4844   0.6093
## Group_39-prediction_file-2  0.6865      0.7241      0.5703   0.6326
## Group_40-prediction_file-1  0.6929      0.5057      0.7734   0.6651
## Group_41-prediction_file-1  0.6739      0.9540      0.3438   0.5907
## Group_41-prediction_file-2  0.6739      0.9540      0.3438   0.5907
## Group_42-prediction_file-1  0.5975      0.1839      0.8828   0.6000
##                            precision f1_score  bAccu
## Group_39-prediction_file-1    0.5111   0.6216 0.6387
## Group_39-prediction_file-2    0.5339   0.6146 0.6472
## Group_40-prediction_file-1    0.6027   0.5500 0.6396
## Group_41-prediction_file-1    0.4970   0.6535 0.6489
## Group_41-prediction_file-2    0.4970   0.6535 0.6489
## Group_42-prediction_file-1    0.5161   0.2712 0.5334
plot.AUC(result.auc.0.6)

Between-method Evaluation

# for all the submission files
result.bM.spearman <-eval.Correlation.Between(real.data = exp.data.s, pred.data = sub.data.s,
                                              method = "spearman",sd.use = NA,z.transform = TRUE)
plot.Correlation.Between(result.bM.spearman$coefficient, method="spearman")

# for best submission of each group
result.bM.pearson <-eval.Correlation.Between(real.data = exp.data.s, pred.data = sub.data.s,
                                              method = "pearson",sd.use = NA,z.transform = TRUE,grouped = TRUE)
## Warning: package 'bindrcpp' was built under R version 3.4.4
plot.Correlation.Between(result.bM.pearson$coefficient, method="pearson")

Partial-Correlation Evaluation (controlling the covariates)

result.pCor <- eval.Partial.Correlation(real.data = exp.data.s, pred.data = sub.data.s, method = "spearman")
## Loading required package: ppcor
## Loading required package: MASS
## Warning: package 'MASS' was built under R version 3.4.4
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
## Warning in pcor(mat, method = "spearman", ...): The inverse of variance-
## covariance matrix is calculated using Moore-Penrose generalized matrix
## invers due to its determinant of zero.
## Warning in sqrt((n - 2 - gp)/(1 - pcor^2)): NaNs produced
plot.Correlation.Between(result.bM.spearman$coefficient, method="Spearman")

PCA Plot

total = cbind(real = exp.data.s$value,sub.data.s$value)
Plot.PCA(na.omit(total), labels=F, legend=TRUE) 

Uniqueness Evaluation

uniqueness as adj.r^2 difference between total linear model and linear models without certain group use.ci = T : if true error bar as confidence interval; if false, error bar as standard deviation

# resampling row
result.uniq = eval.uniqueness(real.data = exp.data.s, pred.data = sub.data.s,boot = TRUE)
result.uniq
##                                  uniqueness       low_ci    high_ci
## Group_39-prediction_file-1      0.001239471 -0.003921640 0.01448081
## Group_40-prediction_file-1      0.002164771 -0.003914487 0.01914733
## Group_41-prediction_file-1      0.010086041 -0.003396465 0.03346761
## Group_42-prediction_file-1      0.003068687 -0.003855298 0.02100289
## Group_43-prediction_file-1      0.006215759 -0.003630618 0.02331629
## Group_44-prediction_file-1      0.005379891 -0.003879752 0.02515186
## Group_45-prediction_file-1      0.001870676 -0.003901157 0.01553485
## Group_46-prediction_file-1-late 0.003285608 -0.003858204 0.01936809
## Group_47-prediction_file-1      0.006514064 -0.003738604 0.02772943
##                                          sd
## Group_39-prediction_file-1      0.006207335
## Group_40-prediction_file-1      0.007790426
## Group_41-prediction_file-1      0.011968472
## Group_42-prediction_file-1      0.009212072
## Group_43-prediction_file-1      0.010268351
## Group_44-prediction_file-1      0.010685242
## Group_45-prediction_file-1      0.006824955
## Group_46-prediction_file-1-late 0.008352844
## Group_47-prediction_file-1      0.010923985
plot.uniqueness(result.uniq, method="", boot = TRUE,use.ci = T)

plot.uniqueness(result.uniq, method="", boot = TRUE,use.ci = F)

# generating from experimental distribution
result.bootvar.uniq = eval.uniqueness(real.data = exp.data.s, pred.data = sub.data.s,boot = TRUE,boot.var = T)
## Warning in rnorm(rep.time, real.data$value[x], real.data$sd[x]): NAs
## produced

## Warning in rnorm(rep.time, real.data$value[x], real.data$sd[x]): NAs
## produced

## Warning in rnorm(rep.time, real.data$value[x], real.data$sd[x]): NAs
## produced

## Warning in rnorm(rep.time, real.data$value[x], real.data$sd[x]): NAs
## produced

## Warning in rnorm(rep.time, real.data$value[x], real.data$sd[x]): NAs
## produced

## Warning in rnorm(rep.time, real.data$value[x], real.data$sd[x]): NAs
## produced
result.bootvar.uniq
##                                    uniqueness        low_ci       high_ci
## Group_39-prediction_file-1      -0.0027158048 -0.0039623391 -0.0005103729
## Group_40-prediction_file-1      -0.0018109820 -0.0037960328  0.0010033611
## Group_41-prediction_file-1       0.0068249980  0.0020283647  0.0123611926
## Group_42-prediction_file-1       0.0003406908 -0.0024782399  0.0047476006
## Group_43-prediction_file-1       0.0038366420 -0.0001059559  0.0080785405
## Group_44-prediction_file-1       0.0019369571 -0.0016295498  0.0059734041
## Group_45-prediction_file-1      -0.0005945561 -0.0029082755  0.0029420296
## Group_46-prediction_file-1-late  0.0025801581 -0.0016677633  0.0078707958
## Group_47-prediction_file-1       0.0041009320 -0.0011966605  0.0116263239
##                                          sd
## Group_39-prediction_file-1      0.001219032
## Group_40-prediction_file-1      0.001545033
## Group_41-prediction_file-1      0.003089869
## Group_42-prediction_file-1      0.002150395
## Group_43-prediction_file-1      0.002562105
## Group_44-prediction_file-1      0.002378842
## Group_45-prediction_file-1      0.001810595
## Group_46-prediction_file-1-late 0.002978781
## Group_47-prediction_file-1      0.003865104
plot.uniqueness(result.bootvar.uniq, method="",boot = TRUE,use.ci = F)